Data Visualization and Statistics¶

Libraries and Configurations¶

Import configuration files

In [ ]:
from configparser import ConfigParser

config = ConfigParser()
config.read("../config.ini")
Out[ ]:
['../config.ini']

Import data libraries

In [ ]:
import pandas as pd

# Import label encoder
from sklearn import preprocessing

Import other libraries

In [ ]:
from rich.progress import Progress
from rich import traceback

traceback.install()
Out[ ]:
<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x105bb9390>>

Custom helper scripts

In [ ]:
%cd ..
from scripts import plotHelper, encodingHelper
%cd data_exploration_cleaning
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/.venv/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/data_exploration_cleaning
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/.venv/lib/python3.11/site-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.
  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]

Import Data¶

In [ ]:
# Combined dataframe raw
combined_df_csv = config["DEFAULT"]["interim_path"] + "combined_df_balanced_encoded.csv"
In [ ]:
combined_df = pd.read_csv(combined_df_csv, index_col=0)

Fixing columns data types

In [ ]:
combined_df.dtypes
Out[ ]:
Timestamp                    object
MAC Address                  object
Channel                       int64
DS Channel                  float64
HT Capabilities               int64
Extended Capabilities         int64
Vendor Specific Tags          int64
SSID                         object
Supported Rates               int64
Extended Supported Rates      int64
VHT Capabilities              int64
HE Capabilities               int64
Length                        int64
Label                        object
dtype: object
In [ ]:
# Converting Timestamp to datetime
combined_df["Timestamp"] = pd.to_datetime(combined_df["Timestamp"])

# Converting Label to string
combined_df["Label"] = combined_df["Label"].astype(str)

# Converting SSID to string
combined_df["SSID"] = combined_df["SSID"].astype(str)

# Converting MAC Address to string
combined_df["MAC Address"] = combined_df["MAC Address"].astype(str)

# Converting HT Capabilities to string
combined_df["HT Capabilities"] = combined_df["HT Capabilities"].astype(str)

# Converting Extended Capabilities to string
combined_df["Extended Capabilities"] = combined_df["Extended Capabilities"].astype(str)

# Converting Vendor Specific Tags to string
combined_df["Vendor Specific Tags"] = combined_df["Vendor Specific Tags"].astype(str)

# Converting Supported Rates to string
combined_df["Supported Rates"] = combined_df["Supported Rates"].astype(str)

# Converting Extended Supported Rates to string
combined_df["Extended Supported Rates"] = combined_df[
    "Extended Supported Rates"
].astype(str)

# Converting VHT Capabilities to string
combined_df["VHT Capabilities"] = combined_df["VHT Capabilities"].astype(str)

# Converting HE Capabilities to string
combined_df["HE Capabilities"] = combined_df["HE Capabilities"].astype(str)

Data Visualization¶

Data Distribution¶

In [ ]:
combined_df
Out[ ]:
Timestamp MAC Address Channel DS Channel HT Capabilities Extended Capabilities Vendor Specific Tags SSID Supported Rates Extended Supported Rates VHT Capabilities HE Capabilities Length Label
0 2023-05-20 13:52:01.864465952 d2:6b:aa:b5:fb:ed 1 1.0 6 17 -1 -1 2 0 -1 62 135 iPhone12Pro_C
1 2023-05-20 13:52:01.884716034 d2:6b:aa:b5:fb:ed 1 1.0 6 17 -1 -1 2 0 -1 62 135 iPhone12Pro_C
2 2023-05-20 13:52:01.910542011 d2:6b:aa:b5:fb:ed 6 6.0 6 17 -1 -1 2 0 -1 62 135 iPhone12Pro_C
3 2023-05-20 13:52:01.930788994 d2:6b:aa:b5:fb:ed 6 6.0 6 17 -1 -1 2 0 -1 62 135 iPhone12Pro_C
4 2023-05-20 13:52:01.968745947 d2:6b:aa:b5:fb:ed 11 11.0 6 17 -1 -1 2 0 -1 62 135 iPhone12Pro_C
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
13939 2021-07-07 12:02:57.579541922 da:a1:19:00:17:f9 6 1.0 8 7 25 -1 0 0 0 13 182 XiaomiRedmiNote7_S
13940 2021-07-07 11:32:04.533828019 da:a1:19:1a:cc:8f 6 8.0 8 7 25 Wind3 HUB-6D1619 0 0 0 14 198 XiaomiRedmiNote7_S
13941 2021-07-07 11:46:50.089955091 da:a1:19:41:c9:b1 11 5.0 8 7 25 -1 0 0 0 32 143 XiaomiRedmiNote7_S
13942 2021-07-07 12:16:31.309731960 da:a1:19:c7:24:b1 1 3.0 8 7 25 -1 0 0 0 14 182 XiaomiRedmiNote7_S
13943 2021-07-07 11:19:07.014556885 da:a1:19:05:11:80 1 5.0 8 7 25 Wind3 HUB-6D1619 0 0 0 14 198 XiaomiRedmiNote7_S

36046 rows × 14 columns

Number of Probe Requests entries per device

In [ ]:
plotHelper.plot_label_distribution(combined_df, "Label")
No description has been provided for this image

Packet Length¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "Length")
No description has been provided for this image
In [ ]:
plotHelper.plot_boxplot(combined_df, "Label", "Length")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
No description has been provided for this image

SSIDs¶

Percentage of SSIDs disclosed in Probe Requests.

In [ ]:
plotHelper.plot_pie_chart(combined_df, "SSID")
No description has been provided for this image
In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "SSID")
No description has been provided for this image
In [ ]:
print(
    combined_df[combined_df["SSID"] != "-1"]["Label"].nunique(),
    "devices disclosing SSID out of",
    combined_df["Label"].nunique(),
    "->",
    round(
        combined_df[combined_df["SSID"] != "-1"]["Label"].nunique()
        / combined_df["Label"].nunique()
        * 100,
        2,
    ),
    "%",
)
7 devices disclosing SSID out of 33 -> 21.21 %

Channel Utilization¶

Channel usage per device

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "Channel")
No description has been provided for this image

DS Channel usage per device

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "DS Channel")
No description has been provided for this image

DS Channel parameter distribution on actual Channel frequencies.

In [ ]:
plotHelper.plot_heatmap(combined_df, "Channel", "DS Channel")
No description has been provided for this image

Plotting Channel usage per single device

In [ ]:
plotHelper.plot_multi_pie_charts(combined_df, "Label", "Channel")
No description has been provided for this image
In [ ]:
plotHelper.plot_multi_pie_charts(combined_df, "Label", "DS Channel")
No description has been provided for this image
In [ ]:
plotHelper.plot_boxplot(combined_df, "Label", "Channel")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
No description has been provided for this image
In [ ]:
plotHelper.plot_boxplot(combined_df, "Label", "DS Channel")
/Users/bacci/Library/CloudStorage/SynologyDrive-giovanni/Research 🌱/Repositories/COMPACT/notebooks/scripts/plotHelper.py:170: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
No description has been provided for this image

Information Elements¶

HT Capabilities¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "HT Capabilities")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "HT Capabilities", other_percentage=0.01)
No description has been provided for this image

HE Capabilities¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "HE Capabilities")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "HE Capabilities")
No description has been provided for this image

Supported Rates¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "Supported Rates")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "Supported Rates", other_percentage=0.01)
No description has been provided for this image

Extended Supported Rates¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "Extended Supported Rates")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "Extended Supported Rates")
No description has been provided for this image

Vendor Specific Tags¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Vendor Specific Tags", "Label")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "Vendor Specific Tags", other_percentage=0.03)
No description has been provided for this image

Extended Capabilities¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "Extended Capabilities")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "Extended Capabilities", other_percentage=0.03)
No description has been provided for this image

VHT Capabilities¶

In [ ]:
plotHelper.plot_heatmap(combined_df, "Label", "VHT Capabilities")
No description has been provided for this image
In [ ]:
plotHelper.plot_pie_chart(combined_df, "VHT Capabilities")
No description has been provided for this image